knitr::opts_chunk$set(warning = FALSE, message = FALSE, cache = T)
library(tidyverse)
library(knitr)
library(formattable)
library(skimr)
library(DT)
library(readxl)
library(xgboost)
library(SmartEDA)
library(DT)
library(tidyquant)
library(pryr)
get_path01 <- function(x){
file.path(getwd(),"Question01",x)}
get_path02 <- function(x){
file.path(getwd(),"Question01","New_FDDC_financial_data_20180524",x)}
get_path03 <- function(x){
file.path(getwd(),"Question01","New_FDDC_financial_data_20180524",
"financial data",x)}
get_path04 <- function(x){
file.path(getwd(),"ana_data",x)}
theme_ilo <- function(){
theme_minimal() +
theme(
# text = element_text(family = "Bookman", color = "gray25"),
plot.subtitle = element_text(size = 9),
plot.caption = element_text(color = "gray30"),
# plot.background = element_rect(fill = "gray95"),
plot.margin = unit(c(5, 10, 5, 10), units = "mm"),
axis.title.x = element_text(size=12,face = "bold"),
axis.title.y = element_text(size=12,face = "bold"),
# x和y的命名,要加粗,ppt才好看
axis.text.x = element_text(size=7, angle = 70, hjust = 1),
# 控制axis字体大小,7号大小最好
axis.text.y = element_text(size=7),
legend.title=element_blank()
)
}FDDC_financial_submit_20180524 <-
read_csv(get_path01("FDDC_financial_submit_20180524.csv"),
col_names = FALSE)
FDDC_financial_submit_20180524## # A tibble: 1,493 x 1
## X1
## <chr>
## 1 000001.XSHE
## 2 000005.XSHE
## 3 000006.XSHE
## 4 000009.XSHE
## 5 000010.XSHE
## 6 000011.XSHE
## 7 000012.XSHE
## 8 000014.XSHE
## 9 000017.XSHE
## 10 000018.XSHE
## # ... with 1,483 more rows
预测1493个股票的营收。
read_lines(get_path03("Balance Sheet.xls"),n_max = 5)## [1] "PK\003\004\024"
## [2] "\177\u07c9YTU,B \x95K\xacĞ\xb7\x8c\xed\x97NoQ\x95\xc9\034\002\032g3\xd1J\x9b\"\001\x9b;m\xec$\023_\xc3\xf7ƓH\x90\x94ժt\0262\xb1\004\024\xbd\xee\xedMg\xb8\xf4\x80\tW[\xccDA䟥ļ\x80Ja\xea<X\x9e\031\xbbP)\xe2\xd70\x91^\xe5S5\001\xd9n6\037e\xee,\x81\xa5\006\xd5\030\xa2\xdby\x85\xb1\x9a\x95\x94\xbc-\xf8\xf3J\xc9\xc8X\x91\xbc\xac\xd6\xd5T\x99Pޗ&W\xc4B\xe5\xdc\xea?$"
## [3] "7\036\x9b\034\xb4\xcbg\025C\xa7\xe8\003(\x8d\005"
## [4] "I_\005\xfaP\025{\x97\x8bR~\xbb0\03597M\017\x83\x9cښآ\xb4R\xc6nt\037\xe0\x8f\x8bQơua!\xb5\xbf\b|\xa2\x8e\xf6\x95踿\022\035\017\xff\xa4\x83\xf8\xcf\xf3\x8fF\x849r\020\x90\x96%९C\004=\xc6\\\xa8"
## [5] "z\a\xb7\x8c\xff\x96\xee\017"
存在编码问题。
read_excel(
get_path02("Company Operation.xls"),
# n_max = 100,
skip = 1,
sheet = "CN"
) %>%
rename(col = `指标`) %>%
distinct(`col`) %>%
filter(col %in% str_subset(col,"机场|航空")) %>%
mutate(col = str_extract(col,"^\\p{Han}{2,}")) %>%
distinct(col)## # A tibble: 6 x 1
## col
## <chr>
## 1 白云机场
## 2 东方航空
## 3 海南航空
## 4 南方航空
## 5 上海机场
## 6 深圳机场
六个目标
TICKER_SYMBOL_股票代码,END_DATE_截止日期进行合并的。finacial_data_airport_airline <- read_csv(get_path04("20180613_wu_finacial_data_airport_airline.csv"))
# read_lines(get_path04("20180613_wu_finacial_data_airport_airline.csv"),n_max = 2)这里的导出注意utf-8格式1。
finacial_data_airport_airline_edit <-
finacial_data_airport_airline %>%
select(-X1) %>%
rename_all(
funs(
str_remove_all(., "\\p{Han}{1,}") %>%
str_remove_all(., "_$") %>%
str_to_lower()
)
)这里使用REVENUE_营业收入作为y变量2。
EDA 主要参考 SmartEDA 包 测评和 Tidyverse使用技巧。
ExpData(data=finacial_data_airport_airline_edit,type=1) %>% datatable()ExpData(data=finacial_data_airport_airline_edit,type=2) %>% datatable()ExpNumStat(finacial_data_airport_airline_edit,by="A",gp="revenue",Qnt=seq(0,1,0.1),MesofShape=2,Outlier=TRUE,round=2) %>%
mutate_at(vars(Per_of_Missing),percent) %>% datatable()Vname – Variable name - 变量名称Group – Target variable -
TN – Total sample (inculded NA observations) - 样本总数nNeg – Total negative observations - 负样本数量nZero – Total zero observations - 零值数量nPos – Total positive observations - 正样本数量NegInf – Negative infinite count - 负无穷大极值PosInf – Positive infinite count - 正无穷大极值NA_value – Not Applicable count - 缺失值Per_of_Missing – Percentage of missings - 缺失率Min – minimum value - 最小值Max – maximum value - 最大值Mean – average value - 平均值Median – median value - 中位数SD – Standard deviation - 总体标准差CV – coefficient of variations (SD/mean)*100 - z scoreIQR – Inter quartile range - 四分位距 \(QD = \frac{Q3-Q1}{2}\)
Qnt – Specified quantiles - 百分位点MesofShape – Skewness and Kurtosis - 偏度和峰度Outlier – Number of outliers - 异常值数量Cor – Correlation b/w target and independent variables - 自变量和因变量相关性ExpNumViz(finacial_data_airport_airline_edit,gp=NULL,nlim=10,Page=c(2,2),sample=NULL)## $`0`
nlim: 少于10个样本不画图sample: 随机选择变量进行展示,NULL表示全部展示ExpNumViz(finacial_data_airport_airline_edit,gp="t_revenue",nlim=4,fname=NULL,col=NULL,Page=c(2,2))## $`0`
ExpCTable(finacial_data_airport_airline_edit,Target="t_revenue",margin=1,clim=10,nlim=NULL,round=2,bin=4,per=F) %>% datatable()ExpCatViz(finacial_data_airport_airline_edit,gp=NULL,fname=NULL,clim=10,col=NULL,margin=2,Page = c(2,1))## $`0`